In [1]:
    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
    
Next, read the sample input tables that will be used for blocking.
In [2]:
    
# Locate the 'datasets' directory under the path returned by em.get_install_path().
# os.path.join inserts the platform separator itself and does not double it
# when the install path already ends with a separator.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the full paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
    
In [3]:
    
# Load table A, then table B, from their CSV files; 'ID' is declared as the key of each
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))
    
In [4]:
    
# Obtain the attribute-type mapping for each input table (A first, then B)
atypes1, atypes2 = (em.get_attr_types(table) for table in (A, B))
    
In [5]:
    
# Inspect which keys the attribute-type mapping for table A contains
atypes1.keys()
    
    Out[5]:
In [6]:
    
# Show the types recorded for table A's attributes of interest, as one tuple
tuple(atypes1[attr] for attr in ('birth_year', 'hourly_wage', 'address', 'name', 'zipcode'))
    
    Out[6]:
In [7]:
    
# Show the types recorded for table B's attributes of interest, as one tuple
tuple(atypes2[attr] for attr in ('birth_year', 'hourly_wage', 'address', 'name', 'zipcode'))
    
    Out[7]:
In [8]:
    
# Compare the type recorded for 'address' in table A against table B
atypes1['address'], atypes2['address']
    
    Out[8]:
In [9]:
    
# Manually override the 'address' type in both mappings so that they hold the
# same value. This mutates atypes1/atypes2 in place.
# NOTE(review): 'str_bt_1w_5w' presumably denotes a string of between 1 and 5
# words — confirm against the py_entitymatching attribute-type documentation.
atypes1['address'] = 'str_bt_1w_5w'
atypes2['address'] = 'str_bt_1w_5w'
    
In [10]:
    
# Get the attribute correspondences between tables A and B
block_c = em.get_attr_corres(A, B)
    
In [11]:
    
# Inspect the keys of the object returned by em.get_attr_corres
block_c.keys()
    
    Out[11]:
In [12]:
    
# Compare object identities: equal ids mean block_c['ltable'] / block_c['rtable']
# are the very same objects as A / B rather than copies
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])
    
    Out[12]:
In [13]:
    
# Show the attribute correspondences as returned by em.get_attr_corres
block_c['corres']
    
    Out[13]:
In [14]:
    
# Replace the correspondences with an explicit list that pairs each selected
# attribute of the left table with the same-named attribute of the right table
block_c['corres'] = [
    (attr, attr)
    for attr in ('name', 'birth_year', 'hourly_wage', 'address', 'zipcode')
]
    
In [23]:
    
# Tokenizers to use when generating features for blocking
tok = em.get_tokenizers_for_blocking() 
# Alternative when generating features for matching (use instead of the line above):
# tok = em.get_tokenizers_for_matching()
    
In [16]:
    
# Display the tokenizers obtained from em.get_tokenizers_for_blocking()
tok
    
    Out[16]:
In [22]:
    
# Similarity functions to use when generating features for blocking
sim = em.get_sim_funs_for_blocking()
# Alternative when generating features for matching (use instead of the line above):
# sim = em.get_sim_funs_for_matching()
    
In [18]:
    
# Display the similarity functions obtained from em.get_sim_funs_for_blocking()
sim
    
    Out[18]:
In [19]:
    
# Generate the feature table from the two input tables, their (edited)
# attribute-type mappings, the attribute correspondences, and the tokenizers
# and similarity functions chosen for blocking.
# NOTE(review): the cells that define `tok` and `sim` carry later execution
# counts (23 and 22) than this cell (19); re-run the notebook top to bottom to
# be sure this table was built from the `tok`/`sim` values shown above.
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)
    
In [20]:
    
# Keep only the features whose left-table attribute is 'address'
feature_table.loc[feature_table['left_attribute'] == 'address']
    
    Out[20]:
In [21]:
    
# Check which Python type em.get_features returned
type(feature_table)
    
    Out[21]: